# Cleaning up SAM output 
library(pacman)
p_load(
  here, data.table, fst, stringr, collapse, ggplot2, sf, tigris, 
  lutz, dplyr, lubridate
)

# Load the SAM output
sam_raw_dt = fread(here(
  "Data/electricity-generation/sam/intermediate_output/results_county.csv"
))
sam_raw_dt[,hour := .I]
# Load crosswalk to counties 
log_dt = fread(here("Data/electricity-generation/sam/nsrb-log.csv"))
county_dt =fread(here('Data/electricity-generation/sam/county-dt.csv'))

# Turn into long table 
sam_dt = 
  melt(
    sam_raw_dt, 
    id.vars = 'hour', 
    value.name = 'elec_gen'
  )[,.(
    hour, 
    nsrb_id = str_extract(variable, '^\\d*(?=_)'), 
    elec_gen
  )] |>
  join(
    log_dt, 
    on = 'nsrb_id', 
    how = 'left'
  )
# Calculate total for each year to get profile
sam_dt[,
  tot_elec := sum(elec_gen),
  keyby = .(geoid, nsrb_id)
][,
  elec_profile := elec_gen/tot_elec
] 

# Converting into UTC time using timezone of centroid
county_dt[,
  tz := tz_lookup_coords(lat = lat, lon = lon, method = "accurate")
]
setkey(county_dt, geoid)
# Conversion of local hour to utc for each timezone
# This is a slow calculation...
tz_dt = 
  join(
    sam_dt,
    county_dt[,.(geoid, tz)],
    on = 'geoid', 
    how = 'left'
  )[,.(tz, hour)] |> 
  unique() |>
  rowwise() |>
  mutate(
    local_time = ymd('20190101', tz = tz) + hours(hour-1),
    utc_time = with_tz(local_time, "UTC")
  ) |>
  data.table()
# Dealing with year mismatch for local to utc conversion 
tz_dt[,utc_time_new := fcase(
  year(utc_time) == 2019, utc_time,
  year(utc_time) == 2018, utc_time + years(1),
  year(utc_time) == 2020, utc_time - years(1)
)]
# Adding to whole dataset
solar_profile_dt = 
  join(
    sam_dt,
    county_dt[,.(geoid, tz)],
    on = 'geoid', 
    how = 'left'
  ) |>
  join(  
    tz_dt, 
    on = c('tz', 'hour')
  )
setnames(
  solar_profile_dt,
  old = c('utc_time', 'utc_time_new'),
  new = c('utc_time_orig','utc_time')
)
solar_profile_dt |> setkey(geoid, utc_time)

# Different time zones start/end at different utc times
solar_profile_dt[,.N,keyby = utc_time][,.N,keyby = N]
# Daylight savings time messes up one hour
solar_profile_dt[is.na(utc_time),.N,keyby = hour]
solar_profile_dt[is.na(utc_time),.N,keyby = geoid][,.N,by = N]
# Turning geoid's into strings
solar_profile_dt[,geoid := str_pad(geoid, side = 'left', width = 5, pad = '0')]
county_dt[,geoid := str_pad(geoid, side = 'left', width = 5, pad = '0')]
# Table with every hour for every county 
all_solar_profile_dt = 
  join(
    CJ(
      geoid = county_dt$geoid,
      utc_time = seq(
        ymd_hms('2019-01-01 00:00:00'),
        ymd_hms('2019-12-31 23:00:00'), 
        by = 'hour'
      )
    ),
    solar_profile_dt,
    on = c('geoid','utc_time'),
    how = 'left'
  ) |>
  join(
    solar_profile_dt[
      is.na(utc_time),.(
        geoid, 
        elec_gen_dst = elec_gen,
        tot_elec_dst = tot_elec,
        elec_profile_dst = elec_profile
    )],
    on = 'geoid',
    how = 'left'
  ) %>% 
  .[,.(
    geoid, utc_time, 
    elec_gen = ifelse(is.na(elec_gen), elec_gen_dst, elec_gen), 
    tot_elec = ifelse(is.na(tot_elec), tot_elec_dst, tot_elec), 
    elec_profile = ifelse(is.na(elec_profile), elec_profile_dst, elec_profile)
  )]

# Saving results
fwrite(
  all_solar_profile_dt, 
  here('Data/electricity-generation/output/solar-profile-county-dt.csv')
)

# Checking elec_profile sums to 1
# tmp = 
#   all_solar_profile_dt |>
#   gby(geoid) |>
#   fsum()
# tmp[abs(elec_profile - 1) > 0.000001]
# all_solar_profile_dt[,.N,keyby = geoid][,.N,keyby = N]
# all_solar_profile_dt[is.na(elec_profile),.N,keyby = geoid]
# google_dt[,
#   fips := str_pad(region_name, side = 'left', width = 11, pad = '0')
# ][,geoid := str_sub(fips,1,5)]
# google_dt |>setkey(geoid)
# tmp = 
#   join(
#     google_dt, 
#     all_solar_profile_dt[!is.na(elec_profile),.N,keyby = geoid], 
#     on = 'geoid', 
#     how = 'left'
#   )
# Only missing alaska
# tmp[,mean(is.na(N)),keyby = .(state_name)][V1 > 0]


# Making figure for the appendix with a few examples
  options(tigris_use_cache = TRUE)
  states_sf = 
    states(year = 2015) |>
    clean_names()
  # County shapes
  county_sf =
    map_dfr(
      states_sf$statefp,
      counties,
      year = 2015
    ) |> 
    clean_names() |>
    st_centroid()
# Picking six random counties from following states: 
set.seed(218)
sample_dt = 
  county_sf |>
  filter(statefp %in% c('04','06','12','36','18','41')) |>
  group_by(statefp) |>
  sample_n(1) |>
  collapse::join(
    data.table(states_sf)[,.(statefp, stusps, state_name = name)], 
    on = 'statefp'
  ) |>
  data.table() %>%
  .[,.(geoid, name = paste0(namelsad, ', ', stusps))] %>%
  setkey(geoid)
solar_profile_dt[,geoid := str_pad(geoid, side = 'left', width = 5, pad = '0')]

ex_profiles_p = 
  join(
    solar_profile_dt[hour != 1635,.(
      elec_gen = mean(elec_gen),
      elec_profile = mean(elec_profile)), 
      keyby = .(
        geoid, 
        hour = hour(local_time), 
        month = month(local_time, label = TRUE)
      )
    ],
    sample_dt,
    on = 'geoid',
    how = 'inner'
  )|>
    ggplot(aes(
      x = hour, 
      y = elec_profile, 
      color = month
    )) +
    geom_line()+
    scale_color_viridis_d("Month", option = 'magma', end = 0.8) +
    facet_wrap(~name) + 
    scale_y_continuous(labels = scales::label_percent()) +
    labs(
      x = "Hour of Day",
      y = "Avg Percent of Annual Solar Generation"
    ) +
    theme_minimal()
ggsave(
  plot = ex_profiles_p, 
  filename = here('figures/electricity-generation/example-county-profiles.jpeg'),
  width = 10*0.6, height = 7*0.6,
  bg = 'white'
)